{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import spacy\n",
    "import numpy as np\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.layers import Dense, Input, Flatten, Embedding\n",
    "from keras.models import Model, Sequential\n",
    "from keras.layers.recurrent import LSTM\n",
    "from keras.layers.core import RepeatVector, Dense, Dropout, Activation\n",
    "from keras.layers.wrappers import TimeDistributed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# load spaCy's 'en' model; nlp.vocab is what get_embeddings() reads the word vectors from\n",
    "nlp = spacy.load('en')\n",
    "nlp.vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# sequence length; shorter sequences are padded with zeros\n",
    "# (also the number of timesteps the thought vector is repeated for the decoder)\n",
    "MAX_SEQ_LEN = 20\n",
    "# default from glove embedding vectors\n",
    "EMBEDDING_DIM = 300\n",
    "# vocabulary cap: only lexemes whose rank is below this get a row in the embedding matrix\n",
    "MAX_NB_WORDS = 10000\n",
    "# train/dev split (not referenced by the cells visible in this notebook)\n",
    "DEV_SPLIT = 0.1\n",
    "\n",
    "# rate passed to the Dropout layers of the encoder and decoder stacks\n",
    "dropout = 0.5\n",
    "# number of LSTM layers stacked on top of the first encoder / decoder layer\n",
    "input_depth = 1\n",
    "output_depth = 1\n",
    "# number of units in each encoder / decoder LSTM layer\n",
    "input_dim = 128\n",
    "output_dim = 128\n",
    "# (encoder, decoder) pairs, indexed as [0] and [1] by the model-building cell\n",
    "depth = (input_depth, output_depth)\n",
    "hidden_dim = (input_dim, output_dim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10001, 300)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_embeddings(vocab, max_rank=None):\n",
    "    \"\"\"\n",
    "    Build an embedding matrix from spacy's glove vectors.\n",
    "\n",
    "    Row `lex.rank` holds the vector of every lexeme that has a vector and\n",
    "    whose rank is below `max_rank`. All other rows, including the extra\n",
    "    last row reserved for <UNK> words, are zero.\n",
    "\n",
    "    Args:\n",
    "        vocab: iterable of lexemes (`has_vector`, `rank`, `vector`) that\n",
    "            also exposes `vectors_length`, e.g. `nlp.vocab`.\n",
    "        max_rank: number of word rows to keep; defaults to MAX_NB_WORDS.\n",
    "\n",
    "    Returns:\n",
    "        float32 array of shape (max_rank + 1, vocab.vectors_length).\n",
    "    \"\"\"\n",
    "    if max_rank is None:\n",
    "        max_rank = MAX_NB_WORDS\n",
    "    # add 1 to array so we can handle <UNK> words\n",
    "    # np.zeros rather than np.ndarray: rows that are never assigned below\n",
    "    # must not be left holding uninitialized memory\n",
    "    vectors = np.zeros((max_rank + 1, vocab.vectors_length), dtype='float32')\n",
    "    for lex in vocab:\n",
    "        if lex.has_vector and lex.rank < max_rank:\n",
    "            vectors[lex.rank] = lex.vector\n",
    "    return vectors\n",
    "\n",
    "# build the embedding matrix from spaCy's vocab; the bare last expression displays its shape\n",
    "embeddings = get_embeddings(nlp.vocab)\n",
    "embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tensor(\"Input_6:0\", shape=(?, 20), dtype=int32)\n",
      "Tensor(\"Embedding_5/Gather:0\", shape=(?, 20, 300), dtype=float32)\n"
     ]
    }
   ],
   "source": [
    "# integer token ids, MAX_SEQ_LEN of them per example\n",
    "sequence_input = Input(name='Input', dtype='int32', shape=(MAX_SEQ_LEN,))\n",
    "print(sequence_input)\n",
    "\n",
    "# frozen lookup table initialised from the pre-computed embedding matrix\n",
    "sequence_embeded = Embedding(\n",
    "    name='Embedding',\n",
    "    input_dim=embeddings.shape[0],\n",
    "    output_dim=EMBEDDING_DIM,\n",
    "    input_length=MAX_SEQ_LEN,\n",
    "    weights=[embeddings],\n",
    "    trainable=False,\n",
    ")(sequence_input)\n",
    "print(sequence_embeded)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# encoder: one input LSTM plus depth[0] stacked LSTMs. Every layer except the\n",
    "# last returns the full sequence so the next LSTM receives 3-D input; the last\n",
    "# one returns only its final output, which becomes the thought vector.\n",
    "# Layer names are numbered sequentially so they stay unique for any depth.\n",
    "n_encoder_layers = depth[0] + 1\n",
    "encoder = sequence_embeded\n",
    "for i in range(n_encoder_layers):\n",
    "    encoder = LSTM(hidden_dim[0], return_sequences=(i < n_encoder_layers - 1),\n",
    "                   activation='relu', name='Encoder%d' % i)(encoder)\n",
    "    # dropout follows the stacked layers only, not the input layer\n",
    "    if i > 0:\n",
    "        encoder = Dropout(dropout, name='EnDropout%d' % i)(encoder)\n",
    "\n",
    "# thought vector, repeated once per output timestep\n",
    "thought_vector = RepeatVector(MAX_SEQ_LEN, name='C')(encoder)\n",
    "\n",
    "# decoder: one input LSTM plus depth[1] stacked LSTMs, all returning sequences\n",
    "decoder = thought_vector\n",
    "for i in range(depth[1] + 1):\n",
    "    decoder = LSTM(hidden_dim[1], return_sequences=True,\n",
    "                   activation='relu', name='Decoder%d' % i)(decoder)\n",
    "    if i > 0:\n",
    "        decoder = Dropout(dropout, name='DeDropout%d' % i)(decoder)\n",
    "\n",
    "# per-timestep softmax over the vocabulary rows of the embedding matrix\n",
    "preds = TimeDistributed(\n",
    "    Dense(embeddings.shape[0], activation='softmax'))(decoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "Input (InputLayer)           (None, 20)                0         \n",
      "_________________________________________________________________\n",
      "Embedding (Embedding)        (None, 20, 300)           3000300   \n",
      "_________________________________________________________________\n",
      "Encoder1 (LSTM)              (None, 20, 128)           219648    \n",
      "_________________________________________________________________\n",
      "Encoder0 (LSTM)              (None, 128)               131584    \n",
      "_________________________________________________________________\n",
      "EnDropout0 (Dropout)         (None, 128)               0         \n",
      "_________________________________________________________________\n",
      "C (RepeatVector)             (None, 20, 128)           0         \n",
      "_________________________________________________________________\n",
      "Decoder1 (LSTM)              (None, 20, 128)           131584    \n",
      "_________________________________________________________________\n",
      "Decoder0 (LSTM)              (None, 20, 128)           131584    \n",
      "_________________________________________________________________\n",
      "DeDropout0 (Dropout)         (None, 20, 128)           0         \n",
      "_________________________________________________________________\n",
      "time_distributed_8 (TimeDist (None, 20, 10001)         1290129   \n",
      "=================================================================\n",
      "Total params: 4,904,829.0\n",
      "Trainable params: 1,904,529.0\n",
      "Non-trainable params: 3,000,300.0\n",
      "_________________________________________________________________\n",
      "None\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/chenzomi/anaconda/lib/python3.6/site-packages/keras/legacy/interfaces.py:86: UserWarning: Update your `Model` call to the Keras 2 API: `Model(inputs=Tensor(\"In..., outputs=Tensor(\"ti...)`\n",
      "  '` call to the Keras 2 API: ' + signature)\n"
     ]
    }
   ],
   "source": [
    "# Keras 2 spells the Model arguments `inputs` / `outputs`\n",
    "model = Model(inputs=sequence_input, outputs=preds)\n",
    "model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
    "# summary() prints the table itself and returns None, so it is not wrapped in print()\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
